Import packages¶

In [1]:
# Import necessary packages for data manipulation and visualization.
import pandas as pd
import numpy as np
import pickle, os, sys  
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot
from plotly.subplots import make_subplots
import seaborn as sns

# Import the 'MLcps' package
from MLcps import getCPS
calculate = getCPS.calculate  # Specify the specific function to use from the module.

Custom Functions for Generating Plots¶

In [2]:
########################
## Function to convert a 
## dictionary to a pandas DF.
########################

def dictTOdf(results):
    """
    Build a per-model metrics table from a results dictionary.

    Parameters:
    results (dict): Must provide the keys "Acc", "Bal_acc", "F1", "recall",
        "precision", "average_precision", "roc_auc" and "model".

    Returns:
    df (DataFrame): One column per metric, indexed by results["model"] and
        sorted by ascending 'F1'.
    """
    # (output column label, key holding that metric in `results`), in column order.
    column_sources = (
        ('Accuracy', "Acc"),
        ('Balanced Acc', "Bal_acc"),
        ('F1', "F1"),
        ('Recall', "recall"),
        ('Precision', "precision"),
        ('Avg precision', "average_precision"),
        ('roc_auc', "roc_auc"),
    )
    metrics = {label: results[key] for label, key in column_sources}
    table = pd.DataFrame(metrics, index=results["model"])

    # Lowest F1 first.
    ordered = table.sort_values('F1')
    return ordered

    
########################
## Function to generate 
## figures 1A, 1B, 2A, 2B, S4A
########################

def plotSTD(results_whole,fileName):
    """
    Plot, for every model, the mean of its metric scores with a standard-deviation
    error bar (primary y-axis) together with its MLcps score (secondary y-axis).

    Parameters:
    results_whole (dict): Results dictionary in the format accepted by dictTOdf.
    fileName (str): Base name for an exported image. It is only referenced by the
        commented-out fig.write_image call below, so nothing is written to disk
        unless that line is re-enabled.

    Returns:
    Figure: plotly figure; models on the x-axis are ordered by increasing
        standard deviation.
    """

    # calculate MLcps
    results_whole_df = dictTOdf(results_whole)
    cpsScore = calculate(results_whole_df)

    x_data = list(results_whole_df.index)
    # Row-wise mean / SD across the metric columns.
    y_data = [round(num, 3) for num in results_whole_df.mean(axis=1)]
    err_y_data = list(results_whole_df.std(axis=1))

    # Pair each model with its MLcps score by algorithm name (as stdPlotNew does)
    # instead of by row position, so the plot stays correct even if `calculate`
    # returns its rows in a different order than the input table.
    mlcps_score = [
        round(cpsScore.loc[cpsScore['Algorithms'] == model, 'Score'].values[0], 4)
        for model in x_data
    ]

    #===================
    ##   sort for SD
    #===================
    # Sort the per-model tuples by SD (ascending), then split them back into
    # parallel lists. Doing it this way also copes with an empty table.
    rows = sorted(zip(err_y_data, y_data, x_data, mlcps_score))
    err_y_data = [row[0] for row in rows]
    y_data = [row[1] for row in rows]
    x_data = [row[2] for row in rows]
    mlcps_score = [row[3] for row in rows]

    fig = make_subplots(rows=1, cols=1,
                        x_title='<b>Models</b>',specs=[[{"secondary_y": True}]])

    # One trace per model: an invisible marker at the mean carrying the SD error bar.
    for model, mean_score, sd in zip(x_data, y_data, err_y_data):
        fig.add_trace(go.Scatter(
                        x=[model],
                        y=[mean_score],
                        mode='markers+text',
                        textposition='top center',
                        error_y=dict(
                            type='data',
                            color = "black",
                            array=[sd],
                            visible=True),
                        marker=dict(color='rgba(0,0,0,0)', size=8),
                        showlegend=False,
                    ),secondary_y=False, row=1, col=1)

    # MLcps scores on the secondary axis, labelled with the value rounded to 3 decimals.
    fig.add_trace(go.Scatter(x=x_data, y=mlcps_score,
                                text=np.round(mlcps_score,
                                              3),
                                mode='markers+text',
                                textposition='top right',
                                name="MLcps",
                                marker=dict(color="#C82F02", size=6),
                ),secondary_y=True)
    fig.update_yaxes(title_text="<b>Metrics Score (Standard Deviation)</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>MLcps</b>", secondary_y=True)
    fig.layout.template="plotly_white"
    #fig.write_image(fileName+".png",scale=5,width=1000)
    return(fig) 
    
    
########################
## Function to generate 
## figures 1C, 1D
########################
def plotCPS_bar_train(results_whole,fileName):
    """
    Draw a horizontal bar plot of the MLcps score of every model for the
    training results, labelling each bar with its score and rank.

    Parameters:
    results_whole (dict): Results dictionary in the format accepted by dictTOdf.
    fileName (str): Base name for an exported image. It is only referenced by the
        commented-out fig.write_image call below, so nothing is written to disk
        unless that line is re-enabled.

    Returns:
    Figure: plotly figure with a single "Training" bar series.
    """

    # calculate MLcps for the training results
    cpsScore_train = calculate(dictTOdf(results_whole))

    # Work on plain lists so that scores, ranks and labels are all matched by
    # position; indexing the pandas Series with integers would be label-based and
    # only agrees with positions when the index happens to be 0..n-1.
    models = cpsScore_train['Algorithms'].tolist()
    train_scores = cpsScore_train['Score'].tolist()

    # Rank 1 = highest score. Sorting (score, position) in descending order means
    # that, among equal scores, the later row receives the better rank.
    order = sorted(range(len(train_scores)),
                   key=lambda pos: (train_scores[pos], pos),
                   reverse=True)
    rank_of = {pos: rank for rank, pos in enumerate(order, start=1)}

    # Bar labels: score with six decimal places followed by the rank in bold.
    labels = [
        f"{score:.6f}  ( Rank  <b><span  ';'>{rank_of[pos]}</span> </b>)"
        for pos, score in enumerate(train_scores)
    ]

    # Create a horizontal bar plot to visualize the training score of each model
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=models,
        x=train_scores,
        text=labels,
        legendgroup="Training",
        orientation='h',
        name="Training",
        textposition="auto",
        marker=dict(color='lightblue',line=dict(width=0.8, color='black'))
    ))

    # Customize the layout
    fig.update_layout(
        title=None,
        xaxis_title="<b>MLcps</b>",
        yaxis_title="<b>Models</b>",
        barmode='group' ,
        template="plotly_white",
        legend_traceorder="reversed",
        legend=dict(x=0.95,  y=1),

        width=600,height=900,
        yaxis=dict(title_standoff=0),
        font=dict(size=16)
    )
    fig.update_layout(yaxis = dict(tickfont = dict(size=18),title_font=dict(size=18)))
    fig.update_layout(xaxis = dict(tickfont = dict(size=18),title_font=dict(size=18)))
    #fig.write_image(fileName+".png",scale=5)
    return(fig) 
    


########################
## Function to generate 
## figures 2C, 2D, S4B
########################
def plotCPS_bar_test_train(results_whole,results_test,fileName):

    """
    Draw grouped horizontal bars comparing the MLcps score of every model on
    the test and the training results, labelling each bar with score and rank.

    Parameters:
    results_whole (dict): Training results, in the format accepted by dictTOdf.
    results_test (dict): Test results, in the format accepted by dictTOdf.
    fileName (str): Base name for an exported image. It is only referenced by the
        commented-out fig.write_image call below, so nothing is written to disk
        unless that line is re-enabled.

    Returns:
    Figure: plotly figure with a "Test" and a "Training" bar series. Only models
        present in both results appear (inner join on 'Algorithms').
    """

    # calculate MLcps for train and test results
    # and merge them in a single df (train -> 'Score_x', test -> 'Score_y')
    cpsScore_train = calculate(dictTOdf(results_whole))
    cpsScore_test = calculate(dictTOdf(results_test))
    merged_df = cpsScore_train.merge(cpsScore_test, on='Algorithms', how='inner')

    # Plain lists keep scores, ranks and labels matched by position.
    models = merged_df['Algorithms'].tolist()

    # (legend name, scores, bar colour) for each series, in drawing order.
    series = [
        ("Test", merged_df['Score_y'].tolist(), 'lightcoral'),
        ("Training", merged_df['Score_x'].tolist(), 'lightblue'),
    ]

    # Create a horizontal bar plot to visualize training and test scores for each model
    fig = go.Figure()

    for name, scores, color in series:
        # Rank 1 = highest score. The sort is stable, so among equal scores the
        # earlier row receives the better rank.
        order = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
        rank_of = {pos: rank for rank, pos in enumerate(order, start=1)}

        # Bar labels: score with six decimal places followed by the rank in bold.
        labels = [
            f"{score:.6f}  ( Rank  <b><span  ';'>{rank_of[pos]}</span> </b>)"
            for pos, score in enumerate(scores)
        ]

        # Add bar trace
        fig.add_trace(go.Bar(
            y=models,
            x=scores,
            text=labels,
            legendgroup=name,
            textposition='auto',
            orientation='h',
            name=name,
            marker=dict(color=color,line=dict(width=0.8, color='black'))
        ))

    # Customize the layout
    fig.update_layout(
        title=None,
        xaxis_title="<b>MLcps</b>",
        yaxis_title="<b>Models</b>",
        barmode='group' ,
        template="plotly_white",
        legend_traceorder="reversed",
        legend=dict(x=0.95,  y=1),

        width=600,height=900,
        yaxis=dict(title_standoff=0),
        font=dict(size=16)
    )
    fig.update_layout(yaxis = dict(tickfont = dict(size=18),title_font=dict(size=18)))
    fig.update_layout(xaxis = dict(tickfont = dict(size=18),title_font=dict(size=18)))
    #fig.write_image(fileName+".png",scale=5)
    return(fig) 
    

########################
## Function to generate 
## figure S4B
########################

def bar_new(results_whole,results_test,fileName):

    """
    Draw grouped horizontal bars comparing the MLcps score of every model on
    the test and the training metric tables of the body signal dataset.

    Parameters:
    results_whole (DataFrame): Training metrics, one row per model. The index
        must hold strings; everything after the first '-' in a label is dropped.
    results_test (DataFrame): Test metrics, same layout as results_whole.
    fileName (str): Base name for an exported image. It is only referenced by the
        commented-out fig.write_image call below, so nothing is written to disk
        unless that line is re-enabled.

    Returns:
    Figure: plotly figure with a "Test" and a "Training" bar series. Only models
        present in both tables appear (inner join on 'Algorithms').
    """

    # Shorten the model labels on copies so the caller's DataFrames are left
    # untouched (the notebook passes the same frames to several functions).
    results_whole = results_whole.copy()
    results_whole.index = results_whole.index.str.split('-').str[0]
    results_test = results_test.copy()
    results_test.index = results_test.index.str.split('-').str[0]

    # calculate MLcps for both tables and merge them in a single df
    # (train -> 'Score_x', test -> 'Score_y')
    cpsScore_train = calculate(results_whole)
    cpsScore_test = calculate(results_test)
    merged_df = cpsScore_train.merge(cpsScore_test, on='Algorithms', how='inner')

    # Plain lists keep scores, ranks and labels matched by position.
    models = merged_df['Algorithms'].tolist()

    # (legend name, scores, bar colour) for each series, in drawing order.
    series = [
        ("Test", merged_df['Score_y'].tolist(), 'lightcoral'),
        ("Training", merged_df['Score_x'].tolist(), 'lightblue'),
    ]

    # Create a horizontal bar plot to visualize training and test scores for each model
    fig = go.Figure()

    for name, scores, color in series:
        # Rank 1 = highest score. The sort is stable, so among equal scores the
        # earlier row receives the better rank.
        order = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
        rank_of = {pos: rank for rank, pos in enumerate(order, start=1)}

        # Bar labels: score with six decimal places followed by the rank in bold.
        labels = [
            f"{score:.6f}  ( Rank  <b><span  ';'>{rank_of[pos]}</span> </b>)"
            for pos, score in enumerate(scores)
        ]

        # Add bar trace
        fig.add_trace(go.Bar(
            y=models,
            x=scores,
            text=labels,
            legendgroup=name,
            textposition='auto',
            orientation='h',
            name=name,
            marker=dict(color=color,line=dict(width=0.8, color='black'))
        ))

    # Customize the layout
    fig.update_layout(
        title=None,
        xaxis_title="<b>MLcps</b>",
        yaxis_title="<b>Models</b>",
        barmode='group' ,
        template="plotly_white",
        legend_traceorder="reversed",
        legend=dict(x=0.95,  y=1),

        width=700,height=900,
        yaxis=dict(title_standoff=0),
        font=dict(size=16)
    )
    fig.update_layout(yaxis = dict(tickfont = dict(size=18),title_font=dict(size=18)))
    fig.update_layout(xaxis = dict(tickfont = dict(size=18),title_font=dict(size=18)))
    #fig.write_image(fileName+".png",scale=5)
    return(fig) 
    

########################
## Function to generate 
## figure S4A
########################
def stdPlotNew(results_whole_df,fileName):

    """
    Plot, for every model of the body signal dataset, the mean of its metric
    scores with a standard-deviation error bar (primary y-axis) together with
    its MLcps score (secondary y-axis).

    Parameters:
    results_whole_df (DataFrame): Metrics table, one row per model, with an 'f1'
        column. The index must hold strings; everything after the first '-' in
        a label is dropped.
    fileName (str): Base name for an exported image. It is only referenced by the
        commented-out fig.write_image call below, so nothing is written to disk
        unless that line is re-enabled.

    Returns:
    Figure: plotly figure; models on the x-axis are ordered by decreasing
        standard deviation.
    """

    # Shorten the model labels on a copy so the caller's DataFrame is left
    # untouched (the notebook passes the same frame to bar_new afterwards).
    results_whole_df = results_whole_df.copy()
    results_whole_df.index = results_whole_df.index.str.split('-').str[0]
    results_whole_df = results_whole_df.sort_values('f1')

    x_data = list(results_whole_df.index)
    # Row-wise mean / SD across the metric columns.
    y_data = [round(num, 3) for num in results_whole_df.mean(axis=1)]
    err_y_data = list(results_whole_df.std(axis=1))

    # Look up each model's MLcps score by algorithm name (first match).
    cpsScore = calculate(results_whole_df)
    mlcps_score = [
        round(cpsScore.loc[cpsScore['Algorithms'] == model, 'Score'].values[0], 4)
        for model in x_data
    ]

    #===============
    ##  sort for SD
    #===============
    # Sort the per-model tuples by SD (descending, stable), then split them back
    # into parallel lists. Doing it this way also copes with an empty table.
    rows = sorted(zip(err_y_data, y_data, x_data, mlcps_score),
                  key=lambda row: row[0], reverse=True)
    err_y_data = [row[0] for row in rows]
    y_data = [row[1] for row in rows]
    x_data = [row[2] for row in rows]
    mlcps_score = [row[3] for row in rows]

    fig = make_subplots(rows=1, cols=1,
                        x_title='<b>Models</b>',specs=[[{"secondary_y": True}]])

    # One trace per model: an invisible marker at the mean carrying the SD error bar.
    for model, mean_score, sd in zip(x_data, y_data, err_y_data):
        fig.add_trace(go.Scatter(
                        x=[model],
                        y=[mean_score],
                        mode='markers+text',
                        textposition='top center',
                        error_y=dict(
                            type='data',
                            color = "black",
                            array=[sd],
                            visible=True),
                        marker=dict(color='rgba(0,0,0,0)', size=8),
                        showlegend=False,
                    ),secondary_y=False, row=1, col=1)

    # MLcps scores on the secondary axis, labelled with the value rounded to 3 decimals.
    fig.add_trace(go.Scatter(x=x_data, y=mlcps_score,
                                text=np.round(mlcps_score,
                                              3),
                                mode='markers+text',
                                textposition='top right',
                                name="MLcps",
                                marker=dict(color="#C82F02", size=6),
                ),secondary_y=True)
    fig.update_yaxes(title_text="<b>Metrics Score (Standard Deviation)</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>MLcps</b>", secondary_y=True)
    fig.layout.template="plotly_white"
    #fig.write_image(fileName+".png",scale=5,width=1100)
    return(fig)    
    

Results Folder¶

In [3]:
# Specify the path to the results folder downloaded from GitHub.
# Either set the MLCPS_RESULTS_DIR environment variable before starting the
# kernel, or edit the fallback path below.
#resultsFolder='/path/to/results/folder'
resultsFolder=os.environ.get(
    "MLCPS_RESULTS_DIR",
    '/Users/akshay/Desktop/prof_katia_plots/pain-paper/MLcps-paper/MLcps/generateManuPlots/results',
)

1. CLL Dataset¶

Setup¶

In [4]:
# Change directory to the CLL results folder.
# Relative paths (the pickle below, and exported figures if the write_image
# calls in the plotting functions are re-enabled) resolve against it.
os.chdir(os.path.join(resultsFolder,"CLL"))

# Load dataset. The `with` block closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load result files
# obtained from a trusted source.
with open("results_whole.pickle",'rb') as fh:
    results_whole = pickle.load(fh)

Figure 1A¶

In [5]:
# Figure 1A: mean metric score with SD error bars plus MLcps per model (CLL).
plotSTD(results_whole, fileName="std")

Figure 1C¶

In [6]:
# Figure 1C: MLcps bar plot with ranks for the training results (CLL).
plotCPS_bar_train(results_whole, fileName="cpsScore_cll")

2. Cervical Dataset¶

Setup¶

In [7]:
# Change directory to the cervical results folder.
# Relative paths (the pickle below, and exported figures if the write_image
# calls in the plotting functions are re-enabled) resolve against it.
os.chdir(os.path.join(resultsFolder,"cervical"))

# Load dataset. The `with` block closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load result files
# obtained from a trusted source.
with open("results_whole.pickle",'rb') as fh:
    results_whole = pickle.load(fh)

Figure 1B¶

In [8]:
# Figure 1B: mean metric score with SD error bars plus MLcps per model (cervical).
plotSTD(results_whole, fileName="std")

Figure 1D¶

In [9]:
# Figure 1D: MLcps bar plot with ranks for the training results (cervical).
plotCPS_bar_train(results_whole, fileName="cpsScore_cervical")

3. TCGA miRNA Dataset¶

Setup¶

In [10]:
# Change directory to the TCGA miRNA results folder.
# Relative paths (the pickles below, and exported figures if the write_image
# calls in the plotting functions are re-enabled) resolve against it.
os.chdir(os.path.join(resultsFolder,"TCGA-BRCA_miRNA"))

# Load dataset. The `with` blocks close the file handles once loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load result files
# obtained from a trusted source.
with open("results_whole.pickle",'rb') as fh:
    results_whole = pickle.load(fh)
with open("results_test.pickle",'rb') as fh:
    results_test = pickle.load(fh)

Figure 2A¶

In [11]:
# Figure 2A: mean metric score with SD error bars plus MLcps per model (TCGA miRNA).
plotSTD(results_whole, fileName="std")

Figure 2C¶

In [12]:
# Figure 2C: MLcps bar plot comparing test and training scores (TCGA miRNA).
plotCPS_bar_test_train(results_whole, results_test, fileName="cpsScore_tcga_miRNA")

4. TCGA mRNA Dataset¶

Setup¶

In [13]:
# Change directory to the TCGA mRNA results folder.
# Relative paths (the pickles below, and exported figures if the write_image
# calls in the plotting functions are re-enabled) resolve against it.
os.chdir(os.path.join(resultsFolder,"TCGA-BRCA_mRNA"))

# Load dataset. The `with` blocks close the file handles once loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load result files
# obtained from a trusted source.
with open("results_whole.pickle",'rb') as fh:
    results_whole = pickle.load(fh)
with open("results_test.pickle",'rb') as fh:
    results_test = pickle.load(fh)

Figure 2B¶

In [14]:
# Figure 2B: mean metric score with SD error bars plus MLcps per model (TCGA mRNA).
plotSTD(results_whole, fileName="std")

Figure 2D¶

In [15]:
# Figure 2D: MLcps bar plot comparing test and training scores (TCGA mRNA).
plotCPS_bar_test_train(results_whole, results_test, fileName="cpsScore_tcga_mRNA")

5. Body Signal Dataset¶

Setup¶

In [16]:
# Make the Body Signal results folder the working directory;
# the relative file names below resolve against it.
body_signal_dir = os.path.join(resultsFolder, "body_signal")
os.chdir(body_signal_dir)

# Read the train and test metric tables; the first spreadsheet column becomes the index.
train, test = (
    pd.read_excel(workbook, index_col=0)
    for workbook in ("train.xlsx", "test.xlsx")
)

Figure S4A¶

In [17]:
# Figure S4A: mean metric score with SD error bars plus MLcps per model (Body Signal).
stdPlotNew(train, fileName="std")

Figure S4B¶

In [18]:
# Figure S4B: MLcps bar plot comparing test and training scores (Body Signal).
bar_new(train, test, fileName="cpsScore_body_signal")